{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## 2.3.1基于python的语料库的预处理\n",
    "这里所说的预处理是指，将文本分割为单词（分词），并将分割后的单词列表转化为单词 ID 列表。"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:02:49.793180900Z",
     "start_time": "2023-05-04T09:02:49.784162200Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "'\\n这里我们使用由单个句子构成的文本作为语料库。本来文本（text）应该包含成千上万个（连续的）句子，但是，考虑到简洁性，这里先对这个小的文本数据进行预处理。\\n'"
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = 'You say goodbye and I say hello.'\n",
    "\"\"\"\n",
    "这里我们使用由单个句子构成的文本作为语料库。本来文本（text）应该包含成千上万个（连续的）句子，但是，考虑到简洁性，这里先对这个小的文本数据进行预处理。\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "text = text.lower()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:03:11.614151300Z",
     "start_time": "2023-05-04T09:03:11.596151Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "text = text.replace('.', ' .')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:03:39.735413300Z",
     "start_time": "2023-05-04T09:03:39.730414700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "'you say goodbye and i say hello .'"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:03:44.066224600Z",
     "start_time": "2023-05-04T09:03:44.055223700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "words = text.split(' ')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:03:59.799805Z",
     "start_time": "2023-05-04T09:03:59.788804200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "data": {
      "text/plain": "['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:04:02.393765500Z",
     "start_time": "2023-05-04T09:04:02.378760700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "首先，使用 lower() 方法将所有字母转化为小写，这样可以将句子开头的单词也作为常规单词处理。然后，将空格作为分隔符，通过 split(' ') 切分句子。考虑到句子结尾处的句号（.），我们先在句号前插入一个空格（即用“ .”替换“.”），再进行分词。\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "word_to_id = {}\n",
    "id_to_word = {}"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:05:18.301076800Z",
     "start_time": "2023-05-04T09:05:18.296079700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "for word in words:\n",
    "    if word not in word_to_id:\n",
    "        new_id = len(word_to_id)\n",
    "        word_to_id[word] = new_id\n",
    "        id_to_word[new_id] = word"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:06:41.939625Z",
     "start_time": "2023-05-04T09:06:41.923626700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "data": {
      "text/plain": "{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "id_to_word"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:06:45.900627400Z",
     "start_time": "2023-05-04T09:06:45.876581900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "data": {
      "text/plain": "{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}"
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_to_id"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:06:49.553782500Z",
     "start_time": "2023-05-04T09:06:49.544782300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [
    {
     "data": {
      "text/plain": "'say'"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "id_to_word[1]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:18:12.471795100Z",
     "start_time": "2023-05-04T09:18:12.460794300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [
    {
     "data": {
      "text/plain": "5"
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_to_id['hello']"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:18:17.547447900Z",
     "start_time": "2023-05-04T09:18:17.535447900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "将单词列表转化为单词 ID 列表\n",
    "\"\"\"\n",
    "import numpy as np\n",
    "\n",
    "corpus = [word_to_id[w] for w in words]\n",
    "corpus = np.array(corpus)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:19:25.549661100Z",
     "start_time": "2023-05-04T09:19:25.404536900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "data": {
      "text/plain": "array([0, 1, 2, 3, 4, 1, 5, 6])"
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:19:26.978875700Z",
     "start_time": "2023-05-04T09:19:26.953764500Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "我们就完成了利用语料库的准备工作。现在，我们将上述一系列处理实现为 preprocess() 函数\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "def preprocess(text):\n",
    "    text = text.lower()\n",
    "    text = text.replace('.', ' .')\n",
    "    words = text.split(' ')\n",
    "\n",
    "    word_to_id = {}\n",
    "    id_to_word = {}\n",
    "    for word in words:\n",
    "        if word not in word_to_id:\n",
    "            new_id = len(word_to_id)\n",
    "            word_to_id[word] = new_id\n",
    "            id_to_word[new_id] = word\n",
    "\n",
    "    corpus = np.array([word_to_id[w] for w in words])\n",
    "\n",
    "    return corpus, word_to_id, id_to_word"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:20:44.116984Z",
     "start_time": "2023-05-04T09:20:44.111984400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [],
   "source": [
    "text = 'You say goodbye and I say hello.'\n",
    "corpus, word_to_id, id_to_word = preprocess(text)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:20:59.203923700Z",
     "start_time": "2023-05-04T09:20:59.189922300Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.3.2 单词的分布式表示\n",
    "\n",
    "单词的分布式表示将单词表示为固定长度的向量。这种向量的特征在于它是用密集向量表示的。密集向量的意思是，向量的各个元 素（大 多 数）是 由 非 0 实数表示的。例如，三维分布式表示是[0.21,-0.45,0.83]\n",
    "\n",
    "## 2.3.4 共现矩阵\n",
    "\n",
    "我们来考虑如何基于分布式假设使用向量表示单词，最直截了当的实现方法是对周围单词的数量进行计数。具体来说，在关注某个单词的情况下，对它的周围出现了多少次什么单词进行计数，然后再汇总。这里，我们将这种做法称为“基于计数的方法”，在有的文献中也称为“基于统计的方法”。"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('..')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:30:19.493466900Z",
     "start_time": "2023-05-04T09:30:19.483466900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [],
   "source": [
    "from common.util import preprocess"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:30:26.052807900Z",
     "start_time": "2023-05-04T09:30:26.034808100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [],
   "source": [
    "text = 'You say goodbye and I say hello.'\n",
    "corpus, word_to_id, id_to_word = preprocess(text)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:30:40.378753Z",
     "start_time": "2023-05-04T09:30:40.366751900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1 2 3 4 1 5 6]\n"
     ]
    }
   ],
   "source": [
    "print(corpus)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:30:46.354049700Z",
     "start_time": "2023-05-04T09:30:46.342048800Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}\n"
     ]
    }
   ],
   "source": [
    "print(id_to_word)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:30:58.707928400Z",
     "start_time": "2023-05-04T09:30:58.690928400Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "单词 you 的上下文仅有 say 这个单词,这也意味着可以用向量 [0, 1, 0, 0, 0, 0, 0] 表示单词 you,接着对单词 ID 为 1 的 say 进行同样的处理,单词 say 可以表示为向量 [1, 0, 1, 0, 1, 1, 0],汇总了所有单词的共现单词的表格。这个表格的各行对应相应单词的向量。因为表格呈矩阵状，所以称为共现矩阵"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [
    "C = np.array([\n",
    "    [0, 1, 0, 0, 0, 0, 0],\n",
    "    [1, 0, 1, 0, 1, 1, 0],\n",
    "    [0, 1, 0, 1, 0, 0, 0],\n",
    "    [0, 0, 1, 0, 1, 0, 0],\n",
    "    [0, 1, 0, 1, 0, 0, 0],\n",
    "    [0, 1, 0, 0, 0, 0, 1],\n",
    "    [0, 0, 0, 0, 0, 1, 0],\n",
    "], dtype=np.int32)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:46:28.726365Z",
     "start_time": "2023-05-04T09:46:28.695295100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1 0 0 0 0 0]\n"
     ]
    }
   ],
   "source": [
    "print(C[0])  # 单词ID为0的向量"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:46:36.379233200Z",
     "start_time": "2023-05-04T09:46:36.366236600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1 0 1 0 0 0]\n"
     ]
    }
   ],
   "source": [
    "print(C[4])  # 单词ID为4的向量"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:46:41.975022400Z",
     "start_time": "2023-05-04T09:46:41.969025300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1 0 1 0 0 0]\n"
     ]
    }
   ],
   "source": [
    "print(C[word_to_id['goodbye']])  # goodbye的向量"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T09:46:50.100375500Z",
     "start_time": "2023-05-04T09:46:50.094371200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "下面，我们来实现一个能直接从语料库生成共现矩阵的函数。我们把这个函数称为 create_co_matrix(corpus, vocab_size,window_size=1)，其中参数 corpus 是单词 ID 列表，参数 vocab_size 是词汇个数，window_size 是窗口大小\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [],
   "source": [
    "def create_co_matrix(corpus, vocab_size, window_size=1):\n",
    "    corpus_size = len(corpus)\n",
    "    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)\n",
    "\n",
    "    for idx, word_id in enumerate(corpus):\n",
    "        # 通过窗口长度确认上下文词的下标\n",
    "        for i in range(1, window_size + 1):\n",
    "            left_idx = idx - i\n",
    "            right_idx = idx + i\n",
    "            # 将当前词的向量在上下午的词的位置上+1\n",
    "            if left_idx >= 0:\n",
    "                left_word_id = corpus[left_idx]\n",
    "                co_matrix[word_id, left_word_id] += 1\n",
    "\n",
    "            if right_idx < corpus_size:\n",
    "                right_word_id = corpus[right_idx]\n",
    "                co_matrix[word_id, right_word_id] += 1\n",
    "    return co_matrix"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:12:28.350267400Z",
     "start_time": "2023-05-04T10:12:28.334269Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "首先，用元素为 0 的二维数组对 co_matrix 进行初始化。然后，针对语料库中的每一个单词，计算它的窗口中包含的单词。同时，检查窗口内的单词是否超出了语料库的左端和右端，之后，我们都将使用这个函数生成共现矩阵\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.3.5 向量间的相似度\n",
    "\n",
    "测量向量间的相似度有很多方法，其中具有代表性的方法有向量内积或欧式距离等。虽然除此之外还有很多方法，但是在测量单词的向量表示的相似度方面，余弦相似度（cosine similarity）是很常用的。\n",
    "\n",
    "分子是向量内积，分母是各个向量的范数。范数表示向量的大小，这里计算的是 L2 范数（即向量各个元素的平方和的平方根）。\n",
    "\n",
    "余弦相似度直观地表示了“两个向量在多大程度上指向同一方向”。两个向量完全指向相同的方向时，余弦相似度为 1；完全指向相反的方向时，余弦相似度为 −1。"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "outputs": [],
   "source": [
    "def cos_similarity(x, y):\n",
    "    nx = x / np.sqrt(np.sum(x ** 2))  # x的正规化\n",
    "    ny = y / np.sqrt(np.sum(y ** 2))  # y的正规化\n",
    "    return np.dot(nx, ny)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:19:21.947594100Z",
     "start_time": "2023-05-04T10:19:21.940593200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "这里，我们假定参数 x 和 y 是 NumPy 数组。首先对向量进行正规化，然后求两个向量的内积。这里余弦相似度的实现虽然完成了，但是还有一个问题。那就是当零向量（元素全部为 0 的向量）被赋值给参数时，会出现“除数为 0”（zero division）的错误。解决此类问题的一个常用方法是，在执行除法时加上一个微小值。这里，通过参数指定一个微小值 eps（eps 是 epsilon 的缩写），并默认 eps=1e-8（= 0.000 000 01）。这样修改后的余弦相似度的实现如下所示\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "outputs": [],
   "source": [
    "def cos_similarity(x, y, eps=1e-8):\n",
    "    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)\n",
    "    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)\n",
    "    return np.dot(nx, ny)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:20:02.468293500Z",
     "start_time": "2023-05-04T10:20:02.449283200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "这里我们用了 1e-8作为微小值，在这么小的值的情况下，根据浮点数的舍入误差，这个微小值会被其他值“吸收”掉。在上面的实现中，因为这个微小值会被向量的范数“吸收”掉，所以在绝大多数情况下，加上 eps不会对最终的计算结果造成影响。而当向量的范数为 0 时，这个微小值可以防止“除数为 0”的错误。\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('..')\n",
    "from common.util import preprocess, create_co_matrix, cos_similarity"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:21:24.854761400Z",
     "start_time": "2023-05-04T10:21:24.842758400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "outputs": [],
   "source": [
    "text = 'You say goodbye and I say hello.'"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:21:47.560154900Z",
     "start_time": "2023-05-04T10:21:47.532153300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "outputs": [],
   "source": [
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "vocab_size = len(word_to_id)\n",
    "C = create_co_matrix(corpus, vocab_size)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:22:01.638815300Z",
     "start_time": "2023-05-04T10:22:01.606791100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7071067691154799\n"
     ]
    }
   ],
   "source": [
    "c0 = C[word_to_id['you']]  # you的单词向量\n",
    "c1 = C[word_to_id['i']]  # i的单词向量\n",
    "print(cos_similarity(c0, c1))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:22:18.892011900Z",
     "start_time": "2023-05-04T10:22:18.860486400Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.3.6 相似单词的排序\n",
    "\n",
    "余弦相似度已经实现好了，使用这个函数，我们可以实现另一个便利的函数：当某个单词被作为查询词时，将与这个查询词相似的单词按降序显示出来。这里将这个函数称为 most_similar()，通过下列参数进行实现。\n",
    "most_similar(query, word_to_id, id_to_word, word_matrix, top=5)\n",
    "\n",
    "query       查询词\n",
    "word_to_id  单词到单词 ID 的字典\n",
    "id_to_word  单词 ID 到单词的字典\n",
    "word_matrix 汇总了单词向量的矩阵，假定保存了与各行对应的单词向量\n",
    "top         显示到前几位"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "outputs": [],
   "source": [
    "def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):\n",
    "    # 取出查询词\n",
    "    if query not in word_to_id:\n",
    "        print('%s is not found' % query)\n",
    "        return\n",
    "\n",
    "    print('\\n[query] ' + query)\n",
    "    query_id = word_to_id[query]\n",
    "    query_vec = word_matrix[query_id]\n",
    "\n",
    "    # 计算余弦相似度\n",
    "    vocab_size = len(id_to_word)\n",
    "    similarity = np.zeros(vocab_size)\n",
    "    for i in range(vocab_size):\n",
    "        similarity[i] = cos_similarity(word_matrix[i], query_vec)\n",
    "\n",
    "    # 基于余弦相似度，按降序输出值\n",
    "    count = 0\n",
    "    for i in (-1 * similarity).argsort():\n",
    "        if id_to_word[i] == query:\n",
    "            continue\n",
    "        print(' %s: %s' % (id_to_word[i], similarity[i]))\n",
    "        count += 1\n",
    "        if count >= top:\n",
    "            return"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:49:06.234845800Z",
     "start_time": "2023-05-04T10:49:06.218845900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('..')\n",
    "from common.util import preprocess, create_co_matrix, most_similar"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:49:15.057247Z",
     "start_time": "2023-05-04T10:49:15.037246Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "outputs": [],
   "source": [
    "text = 'You say goodbye and I say hello.'\n",
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "vocab_size = len(word_to_id)\n",
    "C = create_co_matrix(corpus, vocab_size)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:49:29.753357Z",
     "start_time": "2023-05-04T10:49:29.738357100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[query] you\n",
      " goodbye: 0.7071067691154799\n",
      " i: 0.7071067691154799\n",
      " hello: 0.7071067691154799\n",
      " say: 0.0\n",
      " and: 0.0\n"
     ]
    }
   ],
   "source": [
    "most_similar('you', word_to_id, id_to_word, C, top=5)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T10:49:38.027607400Z",
     "start_time": "2023-05-04T10:49:38.007606600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
